Python Data Analysis Library


faqs

setup

pip install pandas
import pandas as pd
pd.__version__
#  '1.1.0'

Series

a one-dimensional labeled array capable of holding any data type

create

pd.Series(data, index, dtype, copy)

pd.Series() # empty series, dtype float64 by default

# from scalar
pd.Series(0) # dtype int64, integer index by default 
pd.Series(0, ['e0', 'e1', 'e2']) # broadcasts the scalar across all elemnts

# from list
pd.Series([1, 2, 3])
pd.Series([1, 2, 3], ['e0', 'e1', 'e2']) 

# from dict
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}) # keys as index by default
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}, ['e0', 'e1']) # ignore e2
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}, ['e0', 'e1', 'e2', 'e3']) # include e3 as NaN

# from ndarray
pd.Series(np.array([1, 2, 3]), ['e0', 'e1', 'e2']) 

Things to note:

  • you can use a mixture of types for the values and/or index within the same series - if you mixture of types for the values then the dtype of the series will be set to object
  • you can use any hashable values for the index
  • index values don’t have to be unique
  • None gets converted to NaN for dtype float and dtype int
  • None stays as None for dtype object

attributes

ser.shape
ser.count
ser.index
ser.index.name
len(ser)

indexing and slicing

subset by index

  • can use square brackets - e.g. ser['e0'] - but it’s safer to be explicit and use ser.loc
ser.loc['e0'] # returns a scalar (if the index value is unique)
ser.loc[['e0']]
ser.loc[['e0','e2']] 
ser.loc[:'e1'] # up to and INCLUDING 'e1'
ser.loc['e1':] # from and including 'e1'

subset by position

  • can use square brackets - e.g. ser[0] - but it’s safer to be explicit and use ser.iloc
ser.iloc[0] # returns a scalar
ser.iloc[-1] # returns a scalar
ser.iloc[[0]]
ser.iloc[[-1]]
ser.iloc[[0, 2]] 
ser.iloc[:2] # up to and EXCLUDING position 2 (i.e. 0 and 1 only)
ser.iloc[2:] # from and including position 2 (i.e. 2 onward)

subset by boolean mask

# TODO

casting

conversion

list(x)
ser.to_list()

dict(x)
ser.to_dict()

ser.to_numpy()

DataFrame

a two-dimensional labeled data structure with columns of potentially different types

create

pd.DataFrame(data, index, columns, dtype, copy)

# create empty df
df = pd.DataFrame()

from dict

each key is a column

# from dict of lists - each key is a column
pd.DataFrame({'c1': ['x', 'y', 'z'], 
              'c2': [1, 2, 3]},
             index=['r1', 'r2', 'r3'])

# from dict of series - each key is a column
pd.DataFrame({'c1': pd.Series(['x', 'y', 'z'], index=['r1', 'r2', 'r3']),
              'c2': pd.Series([1, 2, 3], index=['r1', 'r2', 'r3'])})

from list

each element is a row

pd.DataFrame([1, 2, 3])
pd.DataFrame([1, 2, 3], columns=['c1'])

# from list of lists
pd.DataFrame([['x', 1], 
              ['y', 2], 
              ['z', 3]],
             index=['r1', 'r2', 'r3'],
             columns=['c1', 'c2'])

# from list of series
pd.DataFrame([pd.Series(['x', 1], ['c1', 'c2']),
              pd.Series(['y', 2], ['c1', 'c2']),
              pd.Series(['z', 3], ['c1', 'c2'])],
             index=['r1', 'r2', 'r3'])
             
# from list of dicts
pd.DataFrame([{'c1': 'x', 'c2': 1},
              {'c1': 'y', 'c2': 2},
              {'c1': 'z', 'c2': 3}],
              index=['r1', 'r2', 'r3'])

attributes

df.shape # (nrows, ncols) tuple
df.head # first 5 rows
df.tail # last 5 rows
df.info
df.dtypes
df.columns
df.index
df.columns.values
df.columns.name # not always
df.columns.names # not always
df.index.values
df.index.name # not always
df.index.names # not always
len(df) # nrows

types

e.g.
object
float64
int64
datetime

indexing and slicing

subset columns by name

  • simple square bracket indexing is reserved for column selection
  • always label-based since columns in a df will always have a label
df['c1'] # returns a series
df[['c1']]
df[['c1', 'c2']]

Equivalently (and less ambiguously),

df.loc[:, 'c1']  # returns a series
df.loc[:, ['c1']]
df.loc[:, ['c1', 'c2']]

subset columns by position

df.iloc[:, 0]  # returns a series
df.iloc[:, [0]]
df.iloc[:, [0, 1]]

subset rows by index

df.loc['r1'] # first row - returns a series
df.loc['r3'] # last row - returns a series
df.loc[['r1']] # first row
df.loc[['r3']] # last row

subset rows by position

df.iloc[0] # first row - returns a series
df.iloc[-1] # last row - returns a series
df.iloc[[0]] # first row
df.iloc[[-1]] # last row

boolean masking

list of booleans:

df.loc[:, [True, False]]
df.loc[:, [True, True]]

boolean series:

# filter rows
df[df['c1'] == "x"] 
df[(df['c1'] == "x") | (df['c1'] == "y")] # bitwise boolean OR operator 
df.loc[df['c1'].isin(['x', 'y'])]

multi-level indexing

# TODO

view vs copy

x = df['c2'] # returns a view
x.to_numpy().flags.owndata # returns False
x[1] = 99999 # triggers a SettingWithCopyWarning warning
df # has been modified...
y = df['c2'][['r1', 'r2', 'r3']] # returns a copy
y.to_numpy().flags.owndata # returns True
y[1] = -99999 # no warnings
df # has not been modified

join

merge

defaults:
* how='inner'
* on=None
* left_on=None
* right_on=None
* left_index=False
* right_index=False

pd.merge(df1, df2, on='c1', how='left')
pd.merge(df1, df2, on='c1', how='right')
pd.merge(df1, df2, on='c1', how='inner')
pd.merge(df1, df2, on='c1', how='outer')

# e.g. inner join on different column names
pd.merge(df1, df2, left_on='c1', right_on='c2')

# e.g. inner join on two column names
pd.merge(df1, df2, left_on=['c1', 'c2'], right_on=['c1', 'c2'])

# e.g. inner join on indexes
pd.merge(df1, df2, left_index=True, right_index=True)

stack

concat


wrangling and eda

append

s1 = pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
s1.append(s2) # does not modify in-place

apply

  • can be applied to Series or DataFrame
# e.g. apply a function element-wise
ser.apply(np.sqrt)
df.apply(np.sqrt)

e.g. apply a function row/column-wise that returns a scalar

df.apply(np.sum) # column sums (i.e. apply across rows)
df.apply(np.sum, axis=1) # row sums (i.e. apply across columns)

e.g. apply a function row-wise that returns a list

fun = lambda x: [1, 2]
df.apply(fun, axis=1)
df.apply(fun, axis=1, result_type='expand') # expands results as unnamed columns

e.g. apply a function row-wise that returns a dict

fun = lambda x: dict(i=x[0], j=x[1])
df.apply(fun, axis=1)
df.apply(fun, axis=1, result_type='expand') # expands results as named columns

applymap

  • can only be applied to DataFrames
  • function is applied on every element individually
# TODO

astype

df['c1'] = df['c1'].astype(int)
df[["col1", "col2"]] = df[["col1", "col2"]].astype(float)

corr

calculate pair-wise correlations for numeric variables.

df.corr()
df[["col1", "col2"]].corr()

cut

labels = ['high', 'med', 'low']
bins = np.linspace(df['c1'].min(), df['c1'].max(), len(labels) + 1)
df['binned'] = pd.cut(df['c1'], bins, labels=labels, include_lowest=True)

from matplotlib import pyplot
pyplot.bar(labels, df["binned"].value_counts())

describe

df.describe() # numerical variables only
df.describe(include_all=True)

drop

defaults:
* axis=0
* inplace=False

df.drop('r1')
df.drop('c1', axis = 1)
df.drop('c1', axis = 1, inplace=True)

dropna

defaults:
* axis=0
* inplace=False
* how='any'

df.dropna() # drop rows with any missing
df.dropna(subset=['c1', 'c2']) # drop rows with any missing in column subset
df.dropna(axis=1) # drop columns with any missing
df.dropna(axis=1, how='all') # drop columns with all missing
df.dropna(axis=1, how='all', inplace=True)

drop_duplicates

# remove duplicate rows based on all columns - keep first occurrence
df.drop_duplicates()

# remove duplicate rows based on all columns - keep last occurrence
df.drop_duplicates(keep='last')

# remove duplicate rows based on all columns - remove all duplicates
df.drop_duplicates(keep=False)

# remove duplicate rows based on specific columns
df.drop_duplicates(subset=['c1', 'c2'])

# modify in-place
df.drop_duplicates(inplace=True)

fillna

df.fillna(0)
df.fillna(method='bfill', axis=0) # fill using previous non-missing in row
df.fillna(method='ffill', axis=1) # fill using next non-missing in column

get_dummies

dummies = pd.get_dummies(df['c1'])
df = pd.concat([df, dummies], axis=1)
df.drop('c1', axis = 1, inplace=True)
df

groupby

  • create a grouped object that can be iterated upon
    • items are tuples:
      • first element is group condition
      • second element is data frame chunk
import pandas as pd
df = pd.DataFrame({'c1': ['x', 'y', 'z', 'z'], 
                   'c2': [1, 2, 3, 4],
                   'c3': [2, 3, 4, 5]},
                  index=['r1', 'r2', 'r3', 'r4'])
df_grouped = df.groupby(['c1'])
list(df_grouped.groups.keys())
#  ['x', 'y', 'z']
df_grouped.get_group("x")
#     c1  c2  c3
#  r1  x   1   2
df_grouped.get_group("y")
#     c1  c2  c3
#  r2  y   2   3
df_grouped.get_group("z")
#     c1  c2  c3
#  r3  z   3   4
#  r4  z   4   5
for group, frame in df_grouped:
  print(group, ":", frame.shape)
#  x : (1, 3)
#  y : (1, 3)
#  z : (2, 3)

aggregate

groupings as index:

df_grouped = df.groupby(['c1'])

# returns df of group means for all columns
df_grouped.mean() 
df_grouped.agg("mean")
df_grouped.agg(np.mean)

# returns df of group means for specified columns
df_grouped.agg({'c2': np.mean}) # returns df
df_grouped[['c2']].mean() # returns df
df_grouped['c2'].mean() # returns series
df_grouped['c2'].agg("mean") # returns series

default integer index and groupings to remain as columns:

df_grouped = df.groupby(['c1'], as_index=False)
# as above

misc

# where fun is a function that gets applied to the index values of df
def fun(item):
  if item > 5:
    return 'group1'
  else:
    return 'group2'
df.groupby(fun) 

idxmax

# TODO

map

  • can only be applied to Series objects
import pandas as pd
df = pd.DataFrame(dict(c1=[1,2,3]))
df
#     c1
#  0   1
#  1   2
#  2   3
mapper = dict(zip([1,2,3], ['r1','r2','r3']))
df['c1'] = df['c1'].map(mapper)
df
#     c1
#  0  r1
#  1  r2
#  2  r3

melt

# TODO

pivot

I.e. pivot wider (spread)

# e.g. col1 as rownames, col2 as colnames, col3 as values
df=df[['c1', 'c2', 'c3']]
df.pivot(index='c1', columns='c2')

pivot_table

  • a generalization of pivot that can handle duplicate values for one pivoted index/column pair

  • can supply a list of aggregation functions using aggfunc argument (default is np.mean).

  • can handle multiple columns for the index and column of the pivoted table (in which case a hierarchical index is generated).

# TODO

plot

# hist
count, bin_edges = np.histogram(df[['x1', 'x2']])
df[['x1', 'x2']].plot(kind='hist', 
                      xticks=bin_edges,
                      bins=15,
                      alpha=0.35,
                      color=['coral', 'darkslateblue'],
                      stacked=True,  # if more than one xvar
                      xlim=(xmin, xmax))

# scatter
df.plot(x='xvar', y='yvar', kind='scatter'
                    alpha=0.5,                  # transparency
                    color='green',
                    s=10,  # size weight (can be a list)
                    xlim=(0, 100))

# line
df[['y1', 'y2']].plot(kind='line') # index is x-axis

# area (assumes wide format, x = index)
df.plot(kind='area', 
       stacked=False,
       alpha=0.35,
       figsize=(20, 10))
method description
bar vertical bar plots
barh horizontal bar plots
hist histogram
box boxplot
kde or density density plots
area area plots
pie pie plots
scatter scatter plots
hexbin hexbin plot

annotations

# using plt
plt.title('title') 
plt.ylabel('ylabel') 
plt.xlabel('xlabel') 

# using ax
ax = df.plot(kind='asdf')
ax.set_title('title')
ax.set_ylabel('ylabel')
ax.set_xlabel('xlabel')

rename

defaults:
* axis=0
* inplace=False

df.rename(dict(r1='r99')) # maps rows by default
df.rename(columns=dict(c1='c999'))
df.rename(mapper=dict(c1='c999'), axis=1)

replace

df['c1'].replace(np.nan, df['c1'].mean())
df['Gender'].replace(to_replace=['male','female'], value=[0,1], inplace=True)

rename_axis

Add/replace axis name.

df.rename_axis('new_index_name') 
df.rename_axis('new_index_name', inplace=True) 
df.rename_axis('new_column_name', axis=1) 
df.rename_axis('new_column_name', axis=1, inplace=True) 

reset_index

Set the index to 0, 1, 2, …

df.reset_index() # adds the old index as a new column
df.reset_index(inplace=True)
df.reset_index(drop=True) # doesn't add the old index as a new column
df.reset_index(drop=True, inplace=True)

set_index

Use a column/columns as the index

df.set_index('c1') # destroys the old index
df.set_index('c1', drop=False) # doesn't destroy the old index
df.set_index('c1', inplace=True)

multi-level indexing:

df.set_index([df.index, 'c1'])
df.set_index(['c1', 'c2'])

sort_values

df.sort_values(by='c1')
df.sort_values(by='c1', ascending=False)
df.sort_values(by='c1', ascending=False, na_position='first')
df.sort_values(by='c1', ascending=False, na_position='first', inplace=True)

Multiple columns

df.sort_values(['c1', 'c2'], ascending=[True, False])

str.split

import pandas as pd
df = pd.DataFrame(dict(c1 = ['1,2,3', '4,5,6', '7,8,9']))
df
#        c1
#  0  1,2,3
#  1  4,5,6
#  2  7,8,9
df['c1'].str.split(',', expand=True)
#     0  1  2
#  0  1  2  3
#  1  4  5  6
#  2  7  8  9

to_datetime

# TODO

to_string

df.to_string(float_format='{:.6f}'.format, # e.g. print floats using 6dp
             index=False, 
             header=False)

transpose

cols to rows and rows to cols

df.transpose() # method
df.T # an attribute

unique

df['c1'].unique()

value_counts

df['c1'].value_counts()
df['c1'].value_counts(normalize=True)
df['c1'].value_counts().to_frame() # convert series to dataframe

where

df.where(condition) # replace with NaN where condition is False
df.where(df['c1'] > 10)
df.where(df['c1'] > 10, inplace=True)

wide_to_long

  • Less flexible but more user-friendly than melt.
# TODO

read

read_csv

pd.read_csv("file.csv")
pd.read_csv('file.csv', skiprows=1) # ignore first row and use row 2 for column names
pd.read_csv("file.csv", header=None, names=['c1', 'c2', 'c3'])
pd.read_csv('file.csv', index_col = 0) # use first col as index

chunking

# TODO

read_json

df = pd.read_json("file.json")

read_sql

df = pd.read_sql(query, con)

read_excel

pip install xlrd
df = pd.read_excel("file.xlsx",
                   sheet_name='Canada by Citizenship',
                   skiprows=range(20),
                   skipfooter=2)

write

to_csv

df.to_csv("file.csv")

to_json

df.to_json("file.json")

to_sql

df.to_sql("file.sql")

to_excel

df.to_excel("file.xlsx")

display

with pd.option_context(

):
  print(df)
# TODO

Create from scratch, lists, NumPy arrays, dicts, series or other data frames
.shape, len(), .index, .count, .columns, .columns.values
.loc, .iloc, .at, .iat, .ix
set_index to use a column as an index
use column as the index df['colname'] = df.index
the general recommendation is that you use .loc to insert rows in your DataFrame
drop
read_csv, parse_dates
.to_csv, .to_excel
pivot, pivot_table, stack, unstack, melt
.iterrows

drop_cols=['thal','thalach']
heart_Pr=pd.read_csv('heart.csv',usecols=lambda cc : cc not in drop_cols, index_col=False)```

import pandas as pdheart_P=pd.read_csv('heart.csv')#print first rowsprint(heart_P.head(5))#Print some basic info about the dataframeheart_P.describe()#Count grouped casesheart_P.groupby('fbs').count()#Count the mean of grouped casesheart_P.groupby(['sex','fbs']).count()heart_P.groupby(['sex','fbs']).mean()

#Select first row. Returns Series.print(ChildAgeFrame.iloc[0])#Select first row. Returns Dataframe.print(ChildAgeFrame.iloc[[0]])#Select first columnprint(ChildAgeFrame.iloc[:,0])#Select last columnprint(ChildAgeFrame.iloc[:,-1])#Select first elementprint(ChildAgeFrame.iloc[0,0])#Select first 2 rowsprint(ChildAgeFrame.iloc[0:2])#Select first 2 columnsprint(ChildAgeFrame.iloc[:,0:2])

#Select columns using nameprint(heart_P[['chol','fbs']])#Select row using regular expressionsdf=heart_P.filter(regex='t.+l',axis=1)#Select row using logical conditiondf=heart_P[(heart_P['chol']> 200) & (heart_P['thalach']==178)]

HotelFrame = pd.DataFrame({'Room Type': ['Regular','Suite', 'Regular','Lux Suite','Suite'],'Floor':['1','2','3','3','2']})pets=['No','Yes','No','No','Yes']HotelFrame['pet']=pets#Using a map to implement logicpets_allowed={'Regular':'No','Suite':'Yes','Lux Suite':'No'}#Add column using the above mapHotelFrame['PETS']=HotelFrame['Room Type'].map(pets_allowed)#ADD a ROWdfadd=HotelFrame.append({'Room Type':'Regular','Floor':'1','pet':'No','PETS':'No'}, ignore_index=True)print(dfadd)

ChildAgeFrame = pd.DataFrame(np.array(([15, 0], [69,3],[35,2])), columns=['age', 'children'], index=['Tes', 'Linda', 'Kate'])#Compute the mean of the columnsprint(ChildAgeFrame.apply(np.mean,axis=0))#Apply a lambda function to all elementsprint(ChildAgeFrame.apply(lambda x: x*5))